/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

// JDK imports
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Nutch imports
import org.apache.nutch.util.NutchConfiguration;

/**
 * A reader to load the information stored in the
 * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
 * 
 * @author mattmann
 * @version 1.0
 */
public class ParsePluginsReader {

	/* our log stream */
	public static final Logger LOG = LoggerFactory
			.getLogger(ParsePluginsReader.class);

	/** The property name of the parse-plugins location */
	private static final String PP_FILE_PROP = "parse.plugin.file";

	/** the parse-plugins file */
	private String fParsePluginsFile = null;

	/**
	 * Constructs a new ParsePluginsReader
	 */
	public ParsePluginsReader() {
	}

	/**
	 * Reads the <code>parse-plugins.xml</code> file and returns the
	 * {@link #ParsePluginList} defined by it.
	 * 
	 * @return A {@link #ParsePluginList} specified by the
	 *         <code>parse-plugins.xml</code> file.
	 * @throws Exception
	 *             If any parsing error occurs.
	 */
	public ParsePluginList parse(Configuration conf) {

		ParsePluginList pList = new ParsePluginList();

		// open up the XML file
		DocumentBuilderFactory factory = null;
		DocumentBuilder parser = null;
		Document document = null;
		InputSource inputSource = null;

		InputStream ppInputStream = null;
		if (fParsePluginsFile != null) {
			URL parsePluginUrl = null;
			try {
				parsePluginUrl = new URL(fParsePluginsFile);
				ppInputStream = parsePluginUrl.openStream();
			} catch (Exception e) {
				if (LOG.isWarnEnabled()) {
					LOG.warn("Unable to load parse plugins file from URL " + "["
							+ fParsePluginsFile + "]. Reason is [" + e + "]");
				}
				return pList;
			}
		} else {
			ppInputStream = conf
					.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
		}

		inputSource = new InputSource(ppInputStream);

		try {
			factory = DocumentBuilderFactory.newInstance();
			parser = factory.newDocumentBuilder();
			document = parser.parse(inputSource);
		} catch (Exception e) {
			if (LOG.isWarnEnabled()) {
				LOG.warn("Unable to parse [" + fParsePluginsFile + "]."
						+ "Reason is [" + e + "]");
			}
			return null;
		}

		Element parsePlugins = document.getDocumentElement();

		// build up the alias hash map
		Map<String, String> aliases = getAliases(parsePlugins);
		// And store it on the parse plugin list
		pList.setAliases(aliases);

		// get all the mime type nodes
		NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");

		// iterate through the mime types
		for (int i = 0; i < mimeTypes.getLength(); i++) {
			Element mimeType = (Element) mimeTypes.item(i);
			String mimeTypeStr = mimeType.getAttribute("name");

			// for each mimeType, get the plugin list
			NodeList pluginList = mimeType.getElementsByTagName("plugin");

			// iterate through the plugins, add them in order read
			// OR if they have a special order="" attribute, then hold those in
			// a separate list, and then insert them into the final list at the
			// order specified
			if (pluginList != null && pluginList.getLength() > 0) {
				List<String> plugList = new ArrayList<String>(
						pluginList.getLength());

				for (int j = 0; j < pluginList.getLength(); j++) {
					Element plugin = (Element) pluginList.item(j);
					String pluginId = plugin.getAttribute("id");
					String extId = aliases.get(pluginId);
					if (extId == null) {
						// Assume an extension id is directly specified
						extId = pluginId;
					}
					String orderStr = plugin.getAttribute("order");
					int order = -1;
					try {
						order = Integer.parseInt(orderStr);
					} catch (NumberFormatException ignore) {
					}
					if (order != -1) {
						plugList.add(order - 1, extId);
					} else {
						plugList.add(extId);
					}
				}

				// now add the plugin list and map it to this mimeType
				pList.setPluginList(mimeTypeStr, plugList);

			} else if (LOG.isWarnEnabled()) {
				LOG.warn(
						"ParsePluginsReader:ERROR:no plugins defined for mime type: "
								+ mimeTypeStr + ", continuing parse");
			}
		}
		return pList;
	}

	/**
	 * Tests parsing of the parse-plugins.xml file. An alternative name for the
	 * file can be specified via the <code>--file</code> option, although the
	 * file must be located in the <code>$NUTCH_HOME/conf</code> directory.
	 * 
	 * @param args
	 *            Currently only the --file argument to specify an alternative
	 *            name for the parse-plugins.xml file is supported.
	 */
	public static void main(String[] args) throws Exception {
		String parsePluginFile = null;
		String usage = "ParsePluginsReader [--file <parse plugin file location>]";

		if ((args.length != 0 && args.length != 2)
				|| (args.length == 2 && !"--file".equals(args[0]))) {
			System.err.println(usage);
			System.exit(1);
		}

		for (int i = 0; i < args.length; i++) {
			if (args[i].equals("--file")) {
				parsePluginFile = args[++i];
			}
		}

		ParsePluginsReader reader = new ParsePluginsReader();

		if (parsePluginFile != null) {
			reader.setFParsePluginsFile(parsePluginFile);
		}

		ParsePluginList prefs = reader.parse(NutchConfiguration.create());

		for (String mimeType : prefs.getSupportedMimeTypes()) {

			System.out.println("MIMETYPE: " + mimeType);
			List<String> plugList = prefs.getPluginList(mimeType);

			System.out.println("EXTENSION IDs:");

			for (String j : plugList) {
				System.out.println(j);
			}
		}

	}

	/**
	 * @return Returns the fParsePluginsFile.
	 */
	public String getFParsePluginsFile() {
		return fParsePluginsFile;
	}

	/**
	 * @param parsePluginsFile
	 *            The fParsePluginsFile to set.
	 */
	public void setFParsePluginsFile(String parsePluginsFile) {
		fParsePluginsFile = parsePluginsFile;
	}

	private Map<String, String> getAliases(Element parsePluginsRoot) {

		Map<String, String> aliases = new HashMap<String, String>();
		NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");

		if (aliasRoot == null
				|| (aliasRoot != null && aliasRoot.getLength() == 0)) {
			if (LOG.isWarnEnabled()) {
				LOG.warn("No aliases defined in parse-plugins.xml!");
			}
			return aliases;
		}

		if (aliasRoot.getLength() > 1) {
			// log a warning, but try and continue processing
			if (LOG.isWarnEnabled()) {
				LOG.warn(
						"There should only be one \"aliases\" tag in parse-plugins.xml");
			}
		}

		Element aliasRootElem = (Element) aliasRoot.item(0);
		NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");

		if (aliasElements != null && aliasElements.getLength() > 0) {
			for (int i = 0; i < aliasElements.getLength(); i++) {
				Element aliasElem = (Element) aliasElements.item(i);
				String parsePluginId = aliasElem.getAttribute("name");
				String extensionId = aliasElem.getAttribute("extension-id");
				if (LOG.isTraceEnabled()) {
					LOG.trace("Found alias: plugin-id: " + parsePluginId
							+ ", extension-id: " + extensionId);
				}
				if (parsePluginId != null && extensionId != null) {
					aliases.put(parsePluginId, extensionId);
				}
			}
		}
		return aliases;
	}

}
